import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import pycaret
# Load the water-potability dataset from its public GitHub mirror.
data = pd.read_csv("https://raw.githubusercontent.com/amankharwal/Website-data/master/water_potability.csv")
# Preview the first rows (the bare expression only displays in an
# interactive / notebook session).
data.head()

# Remove every row that contains at least one NaN.
data = data.dropna()

# Verify that no missing values remain: count the NaNs left in each column.
data.isnull().sum()
# Recorded notebook output, kept as a comment so the file stays valid Python:
#   ph                 0
#   Hardness           0
#   Solids             0
#   Chloramines        0
#   Sulfate            0
#   Conductivity       0
#   Organic_carbon     0
#   Trihalomethanes    0
#   Turbidity          0
#   Potability         0
#   dtype: int64
# Bar chart of how many samples fall into each Potability class.
plt.figure(figsize=(15, 10))
# The column is passed by keyword: the original positional form triggered a
# seaborn FutureWarning stating that from version 0.12 the only valid
# positional argument is `data`, so `countplot(data.Potability)` would error
# or be misinterpreted there.
sns.countplot(x="Potability", data=data)
plt.title("Distribution of Unsafe and Safe Water")
plt.show()
import plotly.express as px

# Feature column -> label shown in the chart title. Insertion order is the
# order in which the charts are rendered.
HISTOGRAM_LABELS = {
    "ph": "PH",
    "Hardness": "Hardness",
    "Solids": "Solids",
    "Chloramines": "Chloramines",
    "Sulfate": "Sulfate",
    "Conductivity": "Conductivity",
    "Organic_carbon": "Organic Carbon",
    "Trihalomethanes": "Trihalomethanes",
    "Turbidity": "Turbidity",
}

# One histogram per feature, coloured by the Potability class, replacing nine
# copy-pasted calls that differed only in column name and title.
for column, label in HISTOGRAM_LABELS.items():
    figure = px.histogram(
        data,
        x=column,
        color="Potability",
        title=f"Factors Affecting Water Quality: {label}",
    )
    figure.show()
# Pairwise correlation matrix across all columns of the cleaned dataset.
correlation = data.corr()
# Rank every column by its correlation with ph, highest value first.
correlation["ph"].sort_values(ascending=False)
# Recorded notebook output, kept as a comment so the file stays valid Python:
#   ph                 1.000000
#   Hardness           0.108948
#   Organic_carbon     0.028375
#   Trihalomethanes    0.018278
#   Potability         0.014530
#   Conductivity       0.014128
#   Sulfate            0.010524
#   Chloramines       -0.024768
#   Turbidity         -0.035849
#   Solids            -0.087615
#   Name: ph, dtype: float64
# Setup note: PyCaret must be installed before this section runs. The original
# notebook did this with IPython shell magics, which are not valid Python and
# were placed after the import they are meant to satisfy. Run once in a shell
# (or prefix with "!" in a notebook cell):
#   pip install pycaret
#   pip install --upgrade pip
from pycaret.classification import *

# Initialise the PyCaret experiment with Potability as the target column.
# session_id fixes the random seed.
# NOTE(review): `silent=True` is accepted by PyCaret 2.x only; it appears to
# have been removed in 3.x -- confirm against the installed version.
clf = setup(data, target = "Potability", silent = True, session_id = 786)

# Train and score PyCaret's catalogue of classifiers for comparison.
compare_models()

# Build a random-forest classifier ("rf") and predict with it.
model = create_model("rf")
# The predictions are made on the same `data` frame that was handed to
# setup(), so they are not an estimate of performance on unseen samples.
predict = predict_model(model, data=data)
predict.head()